In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
In [2]:
# Load the prepared book data set: pipe-delimited, first column used as the row index.
df = pd.read_csv(
    'Amazonon_Bookreads_prepared_data.csv',
    sep='|',
    index_col=0,
)
In [3]:
# Load the children/teens subset: pipe-delimited, first column used as the row index.
df_children_teens_books = pd.read_csv(
    'Amazonon_Bookreads_children_teens_data.csv',
    sep='|',
    index_col=0,
)
In [4]:
# Numerical columns whose pairwise correlations are inspected.
numeric_columns = ['pages', 'rating_value_goodreads', 'rating_count_goodreads',
                   'rating_value_amazon', 'rating_count_amazon', 'price', 'book_age']
columns_relate = df[numeric_columns]
# Compute the correlation matrix once and reuse it for both the mask and the plot.
correlations = columns_relate.corr()

plt.figure(figsize=(16, 6))

# Mask the upper triangle (diagonal included) so every pair is drawn only once.
mask = np.triu(np.ones_like(correlations, dtype=bool))
heatmap = sns.heatmap(correlations, mask=mask, vmin=-1, vmax=1, annot=True, cmap='GnBu')
heatmap.set_title('Triangle Correlation Heatmap', fontdict={'fontsize':18}, pad=16);

Reading group¶

  • Which reading group's books cost more?
  • Which reading group's books have more pages? Hypothesis: books for adults have more pages.
In [5]:
# Keep only the books with a stated reading group, then count the books per group.
has_reading_group = df['reading_group'].ne('not stated')
reading_group = df[has_reading_group]
sns.catplot(data=reading_group, x='reading_group', kind='count', palette='ch:.90')
Out[5]:
<seaborn.axisgrid.FacetGrid at 0x2b4fd268700>
In [6]:
# Price of every book, split by reading group.
sns.stripplot(data=reading_group, x='reading_group', y='price', palette='autumn')
Out[6]:
<AxesSubplot:xlabel='reading_group', ylabel='price'>
In [7]:
# Inspect the two most expensive books, i.e. the price outliers seen in the strip plot.
by_price_desc = reading_group.sort_values('price', ascending=False)
by_price_desc.iloc[:2]
Out[7]:
title author description pages published_date publisher lexile_measure grade_level weight rating_value_goodreads ... genre_4_weight genre_5_weight genre_6_weight genre_7_weight genre_8_weight genre_9_weight price book_age reading_group bookedition_new
18 Daughter of Smoke & Bone Laini Taylor Around the world, black hand prints are appear... 418.0 2011-09-27 Little, Brown and Company 850L 10 and up 657.70840 3.99 ... 0.053337 0.050236 0.041297 0.032116 0.026548 0.019926 50.92 11.0 teens 0
498 Tower Lord Anthony Ryan “The blood-song rose with an unexpected tune, ... 602.0 2014-07-01 Ace NaN NaN 852.75296 4.17 ... 0.020952 0.019429 0.015238 0.014095 0.012952 0.011048 48.60 8.0 adults 0

2 rows × 37 columns

In [8]:
# Blank out the price of the two outlier rows (index labels 18 and 498) in `df`.
# NOTE(review): `reading_group` was filtered from `df` before this cell, so it
# presumably still holds the old prices - confirm if it is reused for price plots.
df.loc[[18, 498], 'price'] = np.nan
In [9]:
# Page-count distribution per reading group; every book is also drawn as a point.
fig = px.box(
    reading_group,
    x='reading_group',
    y='pages',
    points='all',
    title='Number of pages of different reading groups books',
)
fig.show()

Conclusions:

  1. There are more books aimed at teenagers than at children or adults.
  2. Hypothesis "Adults books are more expensive" - disproved. Most books cost 5-20 euro. However, teens and adults books can cost up to 39 euro. For teens and adults books there are outliers; these prices turned out to be incorrect, so they need to be removed.

  3. Hypothesis "Adults books have bigger size (number of pages)" - confirmed. Children's books have the fewest pages. Teens and adults books are similar in size, though adults books have more pages.

Lexile_code¶

using df_children_teens_books

  • Hypothesis: most books have the High-Low code (easy to read but highly interesting)
  • Hypothesis: High-Low code books have a higher rating
In [10]:
# Count how many books carry each lexile code.
sns.catplot(data=df_children_teens_books, x='lexile_code', kind='count', palette='ch:rot=-.25')
Out[10]:
<seaborn.axisgrid.FacetGrid at 0x2b4fd5fe5b0>
In [11]:
# Compare the rating per lexile code on Goodreads (left panel) and Amazon (right panel).
fig, axes = plt.subplots(1, 2, sharex=True, figsize=(15,8))
goodreads_ax, amazon_ax = axes
palette='ch:rot=.25'
books = df_children_teens_books

fig.suptitle('Rating for different lexile code books in Goodreads and Amazon')
goodreads_ax.set_title('Goodreads')
amazon_ax.set_title('Amazon')

# Ratings are passed as plain arrays (`.values`), lexile codes as a Series.
sns.barplot(x=books.lexile_code, y=books.rating_value_goodreads.values, palette=palette, ax=goodreads_ax)
sns.barplot(x=books.lexile_code, y=books.rating_value_amazon.values, palette=palette, ax=amazon_ax)
Out[11]:
<AxesSubplot:title={'center':'Amazon'}, xlabel='lexile_code'>

Conclusions:

  1. Hypothesis "Most books have High-Low code (easy to read but highly interesting)" - confirmed
  2. Hypothesis "High-Low code books have higher rating" - disproved

Book age¶

  • How are book ages distributed across the different reading groups?

  • Do older books have higher rating?

In [12]:
# Kernel density estimate of book age, one filled curve per reading group.
sns.displot(data=reading_group, x='book_age', hue='reading_group', kind='kde', fill=True, palette='rainbow')
Out[12]:
<seaborn.axisgrid.FacetGrid at 0x2b481050610>
In [13]:
# Scatter each platform's rating against book age in a single figure.
rating_series = [
    ('rating_value_goodreads', 'Goodreads rating'),
    ('rating_value_amazon', 'Amazon rating'),
]

fig = go.Figure()
for rating_column, trace_name in rating_series:
    fig.add_trace(
        go.Scatter(x=df['book_age'], y=df[rating_column], mode='markers', name=trace_name)
    )
fig.update_layout(
    title='Rating in Goodreads and Amazon for different age books',
    xaxis_title='book age',
    yaxis_title='rating',
)
fig.show()

Conclusions:

  • Most books were published 10 years ago. But there are new books for teens and children.
  • Old books and new books have the same rating.

Genre¶

  • What are the most popular genres?
  • What genres are most expensive?
In [14]:
# Genre share of the data set.
# Rows without a main genre are dropped first. The filtered frame and the column
# name are then passed to plotly together, so `data_frame` and `names` always
# describe the same rows (rather than the full frame plus a shorter Series).
genre_rows = df.dropna(subset=['main_genre'])
main_genre = genre_rows['main_genre']  # still defined for any later cell that reads it
fig = px.pie(genre_rows, names='main_genre', color_discrete_sequence=px.colors.sequential.Sunset)
# Keep labels inside the slices and hide those that would fall below the minimum font size.
fig.update_traces(textposition='inside')
fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', title='Genre composition')
fig.show()
In [15]:
# Highest and average price per genre; rows missing the genre or the price are ignored.
price_by_genre = df[['main_genre', 'price']].dropna().groupby('main_genre')
max_prices = price_by_genre.max().reset_index()
mean_prices = price_by_genre.mean().reset_index()

bars = [
    go.Bar(name='max price', x=max_prices['main_genre'], y=max_prices['price']),
    go.Bar(name='mean price', x=mean_prices['main_genre'], y=mean_prices['price']),
]
fig = go.Figure(data=bars)
# Overlay mode draws both bars of a genre at the same x position.
fig.update_layout(barmode='overlay', title='Max and mean price for each genre')
fig.show()
In [16]:
# Mean Goodreads rating per genre; rows missing the genre or the rating are excluded.
rated_books = df[['main_genre', 'rating_value_goodreads']].dropna()
goodreads_rating = rated_books.groupby('main_genre').mean().reset_index()

fig = px.scatter(
    goodreads_rating,
    x='main_genre',
    y='rating_value_goodreads',
    color='main_genre',
    text='main_genre',
    title='Rating of each genre according to Goodreads users',
)
fig.update_traces(marker_size=15)
fig.show()

Preliminary conclusion:

Spirituality and Mystery Thriller are the most highly rated genres on Goodreads. Christian books have the lowest rating.

However, some genres have only one occurrence in the data set. The second experiment will include only genres which have more than 20 occurrences.

In [17]:
# Count the rated books per genre (ascending), then keep genres with more than 20 of them.
most_genres1 = (
    df[['main_genre', 'rating_value_goodreads']]
    .dropna()
    .groupby('main_genre')
    .count()
    .sort_values(by='rating_value_goodreads', ascending=True)
    .reset_index()
)
is_frequent = most_genres1['rating_value_goodreads'] > 20
most_genres2 = most_genres1[is_frequent]
In [18]:
# Names of the frequently occurring genres as a plain Python list.
genres_list = most_genres2['main_genre'].tolist()
In [19]:
# Restrict the per-genre mean ratings to the genres with more than 20 occurrences.
in_frequent_genres = goodreads_rating['main_genre'].isin(genres_list)
goodreads_rating_top = goodreads_rating.loc[in_frequent_genres]
In [20]:
# Mean Goodreads rating for the frequently occurring genres only.
fig = px.scatter(
    goodreads_rating_top,
    x='main_genre',
    y='rating_value_goodreads',
    color='main_genre',
    text='main_genre',
    title='Rating of most popular genres according to Goodreads users',
)
fig.update_traces(marker_size=15)
fig.show()

Conclusion:

Paranormal books have the highest rating while Horror books have the lowest. It is interesting that Nonfiction (the most popular genre) has an average rating of 4.

Number of pages¶

  • Do longer books cost more?
In [21]:
# Scatter the page count against the price to look for a relationship.
pages_price_grid = sns.relplot(data=df, x='pages', y='price', kind='scatter', aspect=3)
pages_price_grid.set(title='Number of pages and price correlation')
Out[21]:
<seaborn.axisgrid.FacetGrid at 0x2b481085c40>

Conclusion:

Book size (number of pages) and price show no correlation.

First edition¶

  • Do first edition books cost more?
In [22]:
# Price distribution for each value of the `bookedition_new` flag.
edition_ax = sns.violinplot(data=df, x='bookedition_new', y='price', palette='ch:rot=-.40', gridsize=80)
edition_ax.set(title='Book edition and price correlation')
Out[22]:
[Text(0.5, 1.0, 'Book edition and price correlation')]

Conclusion:

Hypothesis 'First edition books cost more' - disproved

In [ ]: